import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
import numpy as np
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
### Load the training CSV first: it is the dataset used to fit the model
### and the first one to be processed.
train_csv_path = '/Users/alejandromartinruiz/Desktop/patient-survival-prediction/train.csv'
df_train = pd.read_csv(train_csv_path)
### Quick look at the first 10 rows of the dataset.
df_train.head(10)
| Unnamed: 0 | encounter_id | patient_id | hospital_id | age | bmi | elective_surgery | ethnicity | gender | height | ... | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | apache_3j_bodysystem | apache_2_bodysystem | Unnamed: 83 | hospital_death | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 82707 | 36751 | 69039 | 158 | 66.0 | 35.693127 | 0 | Caucasian | F | 152.4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Respiratory | Respiratory | NaN | 0 |
| 1 | 80002 | 68336 | 127397 | 47 | 48.0 | 18.966902 | 0 | African American | F | 165.1 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sepsis | Cardiovascular | NaN | 1 |
| 2 | 54817 | 71682 | 77670 | 27 | 76.0 | 19.278960 | 0 | Caucasian | M | 173.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | Sepsis | Cardiovascular | NaN | 0 |
| 3 | 44799 | 114267 | 19566 | 100 | 43.0 | 45.617284 | 0 | Caucasian | M | 180.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Genitourinary | Renal/Genitourinary | NaN | 0 |
| 4 | 50277 | 128055 | 104990 | 189 | 86.0 | 26.756678 | 0 | Caucasian | F | 154.9 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Cardiovascular | NaN | 1 |
| 5 | 71650 | 37255 | 93534 | 159 | 57.0 | 21.237500 | 1 | African American | F | 185.4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Cardiovascular | NaN | 0 |
| 6 | 4547 | 5676 | 23546 | 77 | 58.0 | 25.593629 | 0 | Caucasian | M | 180.3 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | Neurological | Neurologic | NaN | 0 |
| 7 | 70417 | 68152 | 74832 | 92 | 85.0 | 23.336063 | 0 | Other/Unknown | M | 170.2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Respiratory | Respiratory | NaN | 0 |
| 8 | 15577 | 41770 | 43255 | 116 | 60.0 | 40.742188 | 0 | African American | F | 160.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sepsis | Cardiovascular | NaN | 0 |
| 9 | 33710 | 9676 | 93673 | 204 | 83.0 | 36.383365 | 0 | Caucasian | M | 167.6 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Cardiovascular | NaN | 0 |
10 rows × 86 columns
### Dimensions of the training frame as (rows, columns).
df_train.shape
(64199, 86)
### With the training data loaded, repeat the same process for the test data.
test_csv_path = '/Users/alejandromartinruiz/Desktop/patient-survival-prediction/test.csv'
df_test = pd.read_csv(test_csv_path)
### Look at the first 5 rows of the test set, where the trained model
### will be applied later on.
df_test.head()
| Unnamed: 0 | encounter_id | patient_id | hospital_id | age | bmi | elective_surgery | ethnicity | gender | height | ... | cirrhosis | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | apache_3j_bodysystem | apache_2_bodysystem | Unnamed: 83 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68842 | 21017 | 112951 | 166 | 57.0 | 24.161722 | 1 | Caucasian | M | 193.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Trauma | Trauma | NaN |
| 1 | 60574 | 64708 | 17307 | 39 | NaN | 21.830956 | 0 | Caucasian | M | 170.2 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Trauma | Trauma | NaN |
| 2 | 16041 | 64871 | 123759 | 116 | 72.0 | 24.332277 | 0 | Caucasian | M | 188.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Musculoskeletal/Skin | Undefined Diagnoses | NaN |
| 3 | 42932 | 124400 | 47970 | 100 | 52.0 | 27.625362 | 0 | Caucasian | F | 162.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Metabolic | Metabolic | NaN |
| 4 | 90063 | 32760 | 24104 | 2 | 31.0 | 39.869524 | 1 | Caucasian | F | 152.4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Undefined diagnoses | NaN |
5 rows × 85 columns
### Dimensions of the test frame as (rows, columns).
df_test.shape
(27514, 85)
### Bar plot of how many training rows fall in each value of the target.
target_counts = df_train['hospital_death'].value_counts()
target_counts.plot.bar()
<AxesSubplot:>
from dash import Dash, dcc, html
import plotly.express as px
import pandas as pd

### Dash application object; only its layout is defined here.
### NOTE(review): no call that starts the app is visible in this section;
### the figure is rendered directly through fig2.show() below.
app = Dash(__name__)

### Scatter of age (log-scaled x axis) against APACHE III-J body system,
### coloured by the target, with gender as the hover title.
### NOTE(review): size='patient_id' scales markers by an identifier column,
### which is presumably unintended - confirm.
fig2 = px.scatter(
    df_train,
    x='age',
    y='apache_3j_bodysystem',
    size='patient_id',
    color="hospital_death",
    hover_name="gender",
    log_x=True,
    size_max=60,
)

### NOTE(review): the component id looks carried over from an unrelated
### example - confirm before renaming, callbacks may depend on it.
scatter_graph = dcc.Graph(id='life-exp-vs-gdp', figure=fig2)
app.layout = html.Div([scatter_graph])

fig2.show()
### Tag every row with the dataset it comes from.
for frame, origin in ((df_train, "train"), (df_test, "test")):
    frame["tipo_dato"] = origin
### Keep the target next to the patient id in its own frame.
df_target = df_train.loc[:, ['patient_id', 'hospital_death']]
### Drop the target from the training frame so that train and test have the
### same columns; the target itself is already stored in df_target.
df_train = df_train.drop(columns=['hospital_death'])
df_train.head()
| Unnamed: 0 | encounter_id | patient_id | hospital_id | age | bmi | elective_surgery | ethnicity | gender | height | ... | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | apache_3j_bodysystem | apache_2_bodysystem | Unnamed: 83 | tipo_dato | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 82707 | 36751 | 69039 | 158 | 66.0 | 35.693127 | 0 | Caucasian | F | 152.4 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Respiratory | Respiratory | NaN | train |
| 1 | 80002 | 68336 | 127397 | 47 | 48.0 | 18.966902 | 0 | African American | F | 165.1 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sepsis | Cardiovascular | NaN | train |
| 2 | 54817 | 71682 | 77670 | 27 | 76.0 | 19.278960 | 0 | Caucasian | M | 173.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | Sepsis | Cardiovascular | NaN | train |
| 3 | 44799 | 114267 | 19566 | 100 | 43.0 | 45.617284 | 0 | Caucasian | M | 180.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Genitourinary | Renal/Genitourinary | NaN | train |
| 4 | 50277 | 128055 | 104990 | 189 | 86.0 | 26.756678 | 0 | Caucasian | F | 154.9 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Cardiovascular | NaN | train |
5 rows × 86 columns
### Size of the target frame, then the number of distinct patient ids in it.
df_target.shape
len(df_target.patient_id.unique())
64199
### Stack train and test into a single frame and index it by patient id.
### The original note states that the ids are unique.
### NOTE(review): uniqueness across train + test together is not checked
### in this cell - confirm.
df_union = pd.concat([df_train, df_test]).set_index('patient_id')
df_union.head()
| Unnamed: 0 | encounter_id | hospital_id | age | bmi | elective_surgery | ethnicity | gender | height | icu_admit_source | ... | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | apache_3j_bodysystem | apache_2_bodysystem | Unnamed: 83 | tipo_dato | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| patient_id | |||||||||||||||||||||
| 69039 | 82707 | 36751 | 158 | 66.0 | 35.693127 | 0 | Caucasian | F | 152.4 | Accident & Emergency | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Respiratory | Respiratory | NaN | train |
| 127397 | 80002 | 68336 | 47 | 48.0 | 18.966902 | 0 | African American | F | 165.1 | Accident & Emergency | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sepsis | Cardiovascular | NaN | train |
| 77670 | 54817 | 71682 | 27 | 76.0 | 19.278960 | 0 | Caucasian | M | 173.0 | Accident & Emergency | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | Sepsis | Cardiovascular | NaN | train |
| 19566 | 44799 | 114267 | 100 | 43.0 | 45.617284 | 0 | Caucasian | M | 180.0 | Floor | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Genitourinary | Renal/Genitourinary | NaN | train |
| 104990 | 50277 | 128055 | 189 | 86.0 | 26.756678 | 0 | Caucasian | F | 154.9 | Accident & Emergency | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Cardiovascular | Cardiovascular | NaN | train |
5 rows × 85 columns
### Dimensions of the stacked train + test frame.
df_union.shape
(91713, 85)
### Split the column names into categorical (object dtype) and numeric
### (everything else), then summarise the categorical ones.
object_dtypes = ['object']
cat = df_union.select_dtypes(include=object_dtypes).columns
num = df_union.select_dtypes(exclude=object_dtypes).columns
df_union[cat].describe()
| ethnicity | gender | icu_admit_source | icu_stay_type | icu_type | apache_3j_bodysystem | apache_2_bodysystem | tipo_dato | |
|---|---|---|---|---|---|---|---|---|
| count | 90318 | 91688 | 91601 | 91713 | 91713 | 90051 | 90051 | 91713 |
| unique | 6 | 2 | 5 | 3 | 8 | 11 | 10 | 2 |
| top | Caucasian | M | Accident & Emergency | admit | Med-Surg ICU | Cardiovascular | Cardiovascular | train |
| freq | 70684 | 49469 | 54060 | 86183 | 50586 | 29999 | 38816 | 64199 |
### Summary statistics of the non-object columns.
df_union[num].describe()
| Unnamed: 0 | encounter_id | hospital_id | age | bmi | elective_surgery | height | icu_id | pre_icu_los_days | weight | ... | apache_4a_icu_death_prob | aids | cirrhosis | diabetes_mellitus | hepatic_failure | immunosuppression | leukemia | lymphoma | solid_tumor_with_metastasis | Unnamed: 83 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 91713.000000 | 91713.000000 | 91713.000000 | 87485.000000 | 88284.000000 | 91713.000000 | 90379.000000 | 91713.000000 | 91713.000000 | 88993.000000 | ... | 83766.000000 | 90998.000000 | 90998.000000 | 90998.000000 | 90998.000000 | 90998.000000 | 90998.000000 | 90998.000000 | 90998.000000 | 0.0 |
| mean | 45856.000000 | 65606.079280 | 105.669262 | 62.309516 | 29.185818 | 0.183736 | 169.641588 | 508.357692 | 0.835766 | 84.028340 | ... | 0.043955 | 0.000857 | 0.015693 | 0.225192 | 0.012989 | 0.026165 | 0.007066 | 0.004132 | 0.020638 | NaN |
| std | 26475.406956 | 37795.088538 | 62.854406 | 16.775119 | 8.275142 | 0.387271 | 10.795378 | 228.989661 | 2.487756 | 25.011497 | ... | 0.217341 | 0.029265 | 0.124284 | 0.417711 | 0.113229 | 0.159628 | 0.083763 | 0.064148 | 0.142169 | NaN |
| min | 0.000000 | 1.000000 | 2.000000 | 16.000000 | 14.844926 | 0.000000 | 137.200000 | 82.000000 | -24.947222 | 38.600000 | ... | -1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
| 25% | 22928.000000 | 32852.000000 | 47.000000 | 52.000000 | 23.641975 | 0.000000 | 162.500000 | 369.000000 | 0.035417 | 66.800000 | ... | 0.010000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
| 50% | 45856.000000 | 65665.000000 | 109.000000 | 65.000000 | 27.654655 | 0.000000 | 170.100000 | 504.000000 | 0.138889 | 80.300000 | ... | 0.020000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
| 75% | 68784.000000 | 98342.000000 | 161.000000 | 75.000000 | 32.930206 | 0.000000 | 177.800000 | 679.000000 | 0.409028 | 97.100000 | ... | 0.060000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN |
| max | 91712.000000 | 131051.000000 | 204.000000 | 89.000000 | 67.814990 | 1.000000 | 195.590000 | 927.000000 | 159.090972 | 186.000000 | ... | 0.970000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | NaN |
8 rows × 77 columns
### Check the data type and non-null count of every variable.
df_union.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 91713 entries, 69039 to 105358 Data columns (total 85 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 91713 non-null int64 1 encounter_id 91713 non-null int64 2 hospital_id 91713 non-null int64 3 age 87485 non-null float64 4 bmi 88284 non-null float64 5 elective_surgery 91713 non-null int64 6 ethnicity 90318 non-null object 7 gender 91688 non-null object 8 height 90379 non-null float64 9 icu_admit_source 91601 non-null object 10 icu_id 91713 non-null int64 11 icu_stay_type 91713 non-null object 12 icu_type 91713 non-null object 13 pre_icu_los_days 91713 non-null float64 14 weight 88993 non-null float64 15 apache_2_diagnosis 90051 non-null float64 16 apache_3j_diagnosis 90612 non-null float64 17 apache_post_operative 91713 non-null int64 18 arf_apache 90998 non-null float64 19 gcs_eyes_apache 89812 non-null float64 20 gcs_motor_apache 89812 non-null float64 21 gcs_unable_apache 90676 non-null float64 22 gcs_verbal_apache 89812 non-null float64 23 heart_rate_apache 90835 non-null float64 24 intubated_apache 90998 non-null float64 25 map_apache 90719 non-null float64 26 resprate_apache 90479 non-null float64 27 temp_apache 87605 non-null float64 28 ventilated_apache 90998 non-null float64 29 d1_diasbp_max 91548 non-null float64 30 d1_diasbp_min 91548 non-null float64 31 d1_diasbp_noninvasive_max 90673 non-null float64 32 d1_diasbp_noninvasive_min 90673 non-null float64 33 d1_heartrate_max 91568 non-null float64 34 d1_heartrate_min 91568 non-null float64 35 d1_mbp_max 91493 non-null float64 36 d1_mbp_min 91493 non-null float64 37 d1_mbp_noninvasive_max 90234 non-null float64 38 d1_mbp_noninvasive_min 90234 non-null float64 39 d1_resprate_max 91328 non-null float64 40 d1_resprate_min 91328 non-null float64 41 d1_spo2_max 91380 non-null float64 42 d1_spo2_min 91380 non-null float64 43 d1_sysbp_max 91554 non-null float64 44 d1_sysbp_min 91554 non-null float64 45 d1_sysbp_noninvasive_max 
90686 non-null float64 46 d1_sysbp_noninvasive_min 90686 non-null float64 47 d1_temp_max 89389 non-null float64 48 d1_temp_min 89389 non-null float64 49 h1_diasbp_max 88094 non-null float64 50 h1_diasbp_min 88094 non-null float64 51 h1_diasbp_noninvasive_max 84363 non-null float64 52 h1_diasbp_noninvasive_min 84363 non-null float64 53 h1_heartrate_max 88923 non-null float64 54 h1_heartrate_min 88923 non-null float64 55 h1_mbp_max 87074 non-null float64 56 h1_mbp_min 87074 non-null float64 57 h1_mbp_noninvasive_max 82629 non-null float64 58 h1_mbp_noninvasive_min 82629 non-null float64 59 h1_resprate_max 87356 non-null float64 60 h1_resprate_min 87356 non-null float64 61 h1_spo2_max 87528 non-null float64 62 h1_spo2_min 87528 non-null float64 63 h1_sysbp_max 88102 non-null float64 64 h1_sysbp_min 88102 non-null float64 65 h1_sysbp_noninvasive_max 84372 non-null float64 66 h1_sysbp_noninvasive_min 84372 non-null float64 67 d1_glucose_max 85906 non-null float64 68 d1_glucose_min 85906 non-null float64 69 d1_potassium_max 82128 non-null float64 70 d1_potassium_min 82128 non-null float64 71 apache_4a_hospital_death_prob 83766 non-null float64 72 apache_4a_icu_death_prob 83766 non-null float64 73 aids 90998 non-null float64 74 cirrhosis 90998 non-null float64 75 diabetes_mellitus 90998 non-null float64 76 hepatic_failure 90998 non-null float64 77 immunosuppression 90998 non-null float64 78 leukemia 90998 non-null float64 79 lymphoma 90998 non-null float64 80 solid_tumor_with_metastasis 90998 non-null float64 81 apache_3j_bodysystem 90051 non-null object 82 apache_2_bodysystem 90051 non-null object 83 Unnamed: 83 0 non-null float64 84 tipo_dato 91713 non-null object dtypes: float64(71), int64(6), object(8) memory usage: 60.2+ MB
### Lift the display limits so that every variable is shown in the output.
pd.options.display.width = 1000
pd.options.display.max_rows = None
### Number of missing values per column.
df_union.isna().sum()
Unnamed: 0 0 encounter_id 0 hospital_id 0 age 4228 bmi 3429 elective_surgery 0 ethnicity 1395 gender 25 height 1334 icu_admit_source 112 icu_id 0 icu_stay_type 0 icu_type 0 pre_icu_los_days 0 weight 2720 apache_2_diagnosis 1662 apache_3j_diagnosis 1101 apache_post_operative 0 arf_apache 715 gcs_eyes_apache 1901 gcs_motor_apache 1901 gcs_unable_apache 1037 gcs_verbal_apache 1901 heart_rate_apache 878 intubated_apache 715 map_apache 994 resprate_apache 1234 temp_apache 4108 ventilated_apache 715 d1_diasbp_max 165 d1_diasbp_min 165 d1_diasbp_noninvasive_max 1040 d1_diasbp_noninvasive_min 1040 d1_heartrate_max 145 d1_heartrate_min 145 d1_mbp_max 220 d1_mbp_min 220 d1_mbp_noninvasive_max 1479 d1_mbp_noninvasive_min 1479 d1_resprate_max 385 d1_resprate_min 385 d1_spo2_max 333 d1_spo2_min 333 d1_sysbp_max 159 d1_sysbp_min 159 d1_sysbp_noninvasive_max 1027 d1_sysbp_noninvasive_min 1027 d1_temp_max 2324 d1_temp_min 2324 h1_diasbp_max 3619 h1_diasbp_min 3619 h1_diasbp_noninvasive_max 7350 h1_diasbp_noninvasive_min 7350 h1_heartrate_max 2790 h1_heartrate_min 2790 h1_mbp_max 4639 h1_mbp_min 4639 h1_mbp_noninvasive_max 9084 h1_mbp_noninvasive_min 9084 h1_resprate_max 4357 h1_resprate_min 4357 h1_spo2_max 4185 h1_spo2_min 4185 h1_sysbp_max 3611 h1_sysbp_min 3611 h1_sysbp_noninvasive_max 7341 h1_sysbp_noninvasive_min 7341 d1_glucose_max 5807 d1_glucose_min 5807 d1_potassium_max 9585 d1_potassium_min 9585 apache_4a_hospital_death_prob 7947 apache_4a_icu_death_prob 7947 aids 715 cirrhosis 715 diabetes_mellitus 715 hepatic_failure 715 immunosuppression 715 leukemia 715 lymphoma 715 solid_tumor_with_metastasis 715 apache_3j_bodysystem 1662 apache_2_bodysystem 1662 Unnamed: 83 91713 tipo_dato 0 dtype: int64
### 'Unnamed: 83' is an empty column that adds nothing to the dataset,
### so it is removed entirely.
df_union.drop(columns=['Unnamed: 83'], inplace=True)
### Most frequent ages (value_counts sorts in descending order by default).
edad = df_union['age'].value_counts()
edad.head(10)
67.0 2271 68.0 2173 71.0 2141 72.0 2110 66.0 2059 65.0 2051 70.0 2032 63.0 1977 73.0 1972 64.0 1956 Name: age, dtype: int64
### Histogram of the non-missing ages in 30 bins.
### sns.distplot is deprecated (seaborn emits a FutureWarning for it);
### histplot is the axes-level replacement and draws counts by default.
plt.figure(figsize=(12, 6))
sns.histplot(df_union['age'].dropna(), kde=False, color='darkgreen', bins=30)
/Users/alejandromartinruiz/opt/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:xlabel='age'>
## Rows with a null in any of these columns are discarded
required_columns = [
    'age',
    'd1_potassium_max',
    'd1_potassium_min',
    'apache_2_diagnosis',
    'apache_3j_diagnosis',
    'heart_rate_apache',
    'map_apache',
    'resprate_apache',
    'weight',
    'h1_mbp_noninvasive_max',
    'apache_4a_hospital_death_prob',
    'h1_resprate_max',
    'h1_spo2_max',
    'd1_glucose_max',
    'h1_sysbp_max',
    'h1_sysbp_noninvasive_max',
    'h1_heartrate_max',
    'h1_diasbp_noninvasive_max',
]
df_union = df_union.dropna(subset=required_columns)
df_union.shape
(60499, 84)
## Null values replaced by the mean of their own column
## NOTE(review): the means are taken over whatever rows df_union holds at this
## point; if that is still train and test together, test data influences the
## imputed training values - confirm this is acceptable.
mean_imputed_columns = [
    'gcs_eyes_apache',
    'gcs_motor_apache',
    'gcs_verbal_apache',
    'temp_apache',
    'd1_diasbp_max',
    'd1_diasbp_min',
    'd1_diasbp_noninvasive_max',
    'd1_diasbp_noninvasive_min',
    'd1_mbp_max',
    'd1_mbp_min',
    'd1_mbp_noninvasive_max',
    'd1_mbp_noninvasive_min',
    'd1_resprate_max',
    'd1_resprate_min',
    'bmi',
    'height',
    'd1_temp_max',
    'd1_temp_min',
]
for column in mean_imputed_columns:
    df_union[column] = df_union[column].fillna(df_union[column].mean())
### Backward fill: a missing admit source takes the value of the next row.
df_union['icu_admit_source'] = df_union['icu_admit_source'].bfill()
### Distinct ethnicity values, including the missing marker if any remain.
df_union['ethnicity'].unique()
array(['Caucasian', 'Hispanic', 'African American', nan, 'Asian',
'Other/Unknown', 'Native American'], dtype=object)
### Missing ethnicity is assigned the 'Other/Unknown' label.
ethnicity_fill = 'Other/Unknown'
df_union['ethnicity'] = df_union['ethnicity'].fillna(value=ethnicity_fill)
### Distinct gender values, to see whether any are missing.
df_union['gender'].unique()
array(['F', 'M', nan], dtype=object)
### Missing gender is assigned its own 'Other' label.
gender_fill = 'Other'
df_union['gender'] = df_union['gender'].fillna(value=gender_fill)
### Bar plot of how often each gcs_unable_apache value occurs.
df_union['gcs_unable_apache'].value_counts().plot.bar()
<AxesSubplot:>
### Missing gcs_unable_apache flags are set to 0.
df_union['gcs_unable_apache'] = df_union['gcs_unable_apache'].fillna(value=0)
### Re-count the missing values per column after the imputation steps.
df_union.isna().sum()
Unnamed: 0 0 encounter_id 0 hospital_id 0 age 0 bmi 0 elective_surgery 0 ethnicity 0 gender 0 height 0 icu_admit_source 0 icu_id 0 icu_stay_type 0 icu_type 0 pre_icu_los_days 0 weight 0 apache_2_diagnosis 0 apache_3j_diagnosis 0 apache_post_operative 0 arf_apache 0 gcs_eyes_apache 0 gcs_motor_apache 0 gcs_unable_apache 0 gcs_verbal_apache 0 heart_rate_apache 0 intubated_apache 0 map_apache 0 resprate_apache 0 temp_apache 0 ventilated_apache 0 d1_diasbp_max 0 d1_diasbp_min 0 d1_diasbp_noninvasive_max 0 d1_diasbp_noninvasive_min 0 d1_heartrate_max 0 d1_heartrate_min 0 d1_mbp_max 0 d1_mbp_min 0 d1_mbp_noninvasive_max 0 d1_mbp_noninvasive_min 0 d1_resprate_max 0 d1_resprate_min 0 d1_spo2_max 0 d1_spo2_min 0 d1_sysbp_max 0 d1_sysbp_min 0 d1_sysbp_noninvasive_max 0 d1_sysbp_noninvasive_min 0 d1_temp_max 0 d1_temp_min 0 h1_diasbp_max 0 h1_diasbp_min 0 h1_diasbp_noninvasive_max 0 h1_diasbp_noninvasive_min 0 h1_heartrate_max 0 h1_heartrate_min 0 h1_mbp_max 0 h1_mbp_min 0 h1_mbp_noninvasive_max 0 h1_mbp_noninvasive_min 0 h1_resprate_max 0 h1_resprate_min 0 h1_spo2_max 0 h1_spo2_min 0 h1_sysbp_max 0 h1_sysbp_min 0 h1_sysbp_noninvasive_max 0 h1_sysbp_noninvasive_min 0 d1_glucose_max 0 d1_glucose_min 0 d1_potassium_max 0 d1_potassium_min 0 apache_4a_hospital_death_prob 0 apache_4a_icu_death_prob 0 aids 0 cirrhosis 0 diabetes_mellitus 0 hepatic_failure 0 immunosuppression 0 leukemia 0 lymphoma 0 solid_tumor_with_metastasis 0 apache_3j_bodysystem 0 apache_2_bodysystem 0 tipo_dato 0 dtype: int64
### Dimensions of df_union after the missing-value handling.
df_union.shape
(60499, 84)
### List the dtype of every column.
df_union.dtypes
Unnamed: 0 int64 encounter_id int64 hospital_id int64 age float64 bmi float64 elective_surgery int64 ethnicity object gender object height float64 icu_admit_source object icu_id int64 icu_stay_type object icu_type object pre_icu_los_days float64 weight float64 apache_2_diagnosis float64 apache_3j_diagnosis float64 apache_post_operative int64 arf_apache float64 gcs_eyes_apache float64 gcs_motor_apache float64 gcs_unable_apache float64 gcs_verbal_apache float64 heart_rate_apache float64 intubated_apache float64 map_apache float64 resprate_apache float64 temp_apache float64 ventilated_apache float64 d1_diasbp_max float64 d1_diasbp_min float64 d1_diasbp_noninvasive_max float64 d1_diasbp_noninvasive_min float64 d1_heartrate_max float64 d1_heartrate_min float64 d1_mbp_max float64 d1_mbp_min float64 d1_mbp_noninvasive_max float64 d1_mbp_noninvasive_min float64 d1_resprate_max float64 d1_resprate_min float64 d1_spo2_max float64 d1_spo2_min float64 d1_sysbp_max float64 d1_sysbp_min float64 d1_sysbp_noninvasive_max float64 d1_sysbp_noninvasive_min float64 d1_temp_max float64 d1_temp_min float64 h1_diasbp_max float64 h1_diasbp_min float64 h1_diasbp_noninvasive_max float64 h1_diasbp_noninvasive_min float64 h1_heartrate_max float64 h1_heartrate_min float64 h1_mbp_max float64 h1_mbp_min float64 h1_mbp_noninvasive_max float64 h1_mbp_noninvasive_min float64 h1_resprate_max float64 h1_resprate_min float64 h1_spo2_max float64 h1_spo2_min float64 h1_sysbp_max float64 h1_sysbp_min float64 h1_sysbp_noninvasive_max float64 h1_sysbp_noninvasive_min float64 d1_glucose_max float64 d1_glucose_min float64 d1_potassium_max float64 d1_potassium_min float64 apache_4a_hospital_death_prob float64 apache_4a_icu_death_prob float64 aids float64 cirrhosis float64 diabetes_mellitus float64 hepatic_failure float64 immunosuppression float64 leukemia float64 lymphoma float64 solid_tumor_with_metastasis float64 apache_3j_bodysystem object apache_2_bodysystem object tipo_dato object dtype: object
### Replace each categorical column by an integer-coded copy.
### The encoder is re-fitted for every column, so after the loop it only holds
### the classes of the last column processed.
encoder = LabelEncoder()
### source column -> name of the encoded column that replaces it.
### NOTE(review): 'Icu_stay_type' has no leading underscore, unlike the others;
### the name is kept as is because later code may reference it.
encoded_names = {
    'ethnicity': '_Ethnicity',
    'gender': '_Gender',
    'icu_admit_source': '_Icu_admit_source',
    'icu_type': '_Icu_type',
    'apache_3j_bodysystem': '_Apache_3j_bodysystem',
    'apache_2_bodysystem': '_Apache_2_bodysystem',
    'icu_stay_type': 'Icu_stay_type',
}
for source_col, encoded_col in encoded_names.items():
    df_union[encoded_col] = encoder.fit_transform(df_union[source_col].values)
    del df_union[source_col]
df_union.dtypes
Unnamed: 0 int64 encounter_id int64 hospital_id int64 age float64 bmi float64 elective_surgery int64 height float64 icu_id int64 pre_icu_los_days float64 weight float64 apache_2_diagnosis float64 apache_3j_diagnosis float64 apache_post_operative int64 arf_apache float64 gcs_eyes_apache float64 gcs_motor_apache float64 gcs_unable_apache float64 gcs_verbal_apache float64 heart_rate_apache float64 intubated_apache float64 map_apache float64 resprate_apache float64 temp_apache float64 ventilated_apache float64 d1_diasbp_max float64 d1_diasbp_min float64 d1_diasbp_noninvasive_max float64 d1_diasbp_noninvasive_min float64 d1_heartrate_max float64 d1_heartrate_min float64 d1_mbp_max float64 d1_mbp_min float64 d1_mbp_noninvasive_max float64 d1_mbp_noninvasive_min float64 d1_resprate_max float64 d1_resprate_min float64 d1_spo2_max float64 d1_spo2_min float64 d1_sysbp_max float64 d1_sysbp_min float64 d1_sysbp_noninvasive_max float64 d1_sysbp_noninvasive_min float64 d1_temp_max float64 d1_temp_min float64 h1_diasbp_max float64 h1_diasbp_min float64 h1_diasbp_noninvasive_max float64 h1_diasbp_noninvasive_min float64 h1_heartrate_max float64 h1_heartrate_min float64 h1_mbp_max float64 h1_mbp_min float64 h1_mbp_noninvasive_max float64 h1_mbp_noninvasive_min float64 h1_resprate_max float64 h1_resprate_min float64 h1_spo2_max float64 h1_spo2_min float64 h1_sysbp_max float64 h1_sysbp_min float64 h1_sysbp_noninvasive_max float64 h1_sysbp_noninvasive_min float64 d1_glucose_max float64 d1_glucose_min float64 d1_potassium_max float64 d1_potassium_min float64 apache_4a_hospital_death_prob float64 apache_4a_icu_death_prob float64 aids float64 cirrhosis float64 diabetes_mellitus float64 hepatic_failure float64 immunosuppression float64 leukemia float64 lymphoma float64 solid_tumor_with_metastasis float64 tipo_dato object _Ethnicity int64 _Gender int64 _Icu_admit_source int64 _Icu_type int64 _Apache_3j_bodysystem int64 _Apache_2_bodysystem int64 Icu_stay_type int64 dtype: object
### Heatmap of the pairwise correlations between a handful of variables.
list_correlacion = [
    'd1_diasbp_max',
    'd1_diasbp_min',
    'd1_diasbp_noninvasive_max',
    'd1_diasbp_noninvasive_min',
    'd1_heartrate_max',
    'ventilated_apache',
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
]
df_relacion = df_union.loc[:, list_correlacion]
correlation_mat = df_relacion.corr()
sns.heatmap(correlation_mat)
plt.show()
### Columns inspected for outliers, one boxplot each.
list_correlacion2 = [
    'hospital_id', 'age', 'bmi', 'height',
    'icu_id', 'pre_icu_los_days', 'weight',
    'apache_2_diagnosis', 'apache_3j_diagnosis', 'heart_rate_apache',
    'map_apache', 'resprate_apache', 'temp_apache',
    'ventilated_apache', 'd1_diasbp_max', 'd1_diasbp_min',
    'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min',
    'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min',
    'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max',
    'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max',
    'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min',
    'd1_temp_max', 'd1_temp_min', 'h1_diasbp_max', 'h1_diasbp_min',
    'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min',
    'h1_heartrate_max', 'h1_heartrate_min', 'h1_mbp_max', 'h1_mbp_min',
    'h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min', 'h1_resprate_max',
    'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min', 'h1_sysbp_max',
    'h1_sysbp_min', 'h1_sysbp_noninvasive_max', 'h1_sysbp_noninvasive_min',
    'd1_glucose_max', 'd1_glucose_min', 'd1_potassium_max',
    'd1_potassium_min', 'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
]
### Print the column name, then draw and show its boxplot as a separate figure.
### NOTE(review): the loop body is assumed to include plt.show(), i.e. one
### figure per column - confirm against the original notebook cell.
for column_name in list_correlacion2:
    print(column_name)
    sns.boxplot(x=df_union[column_name])
    plt.show()
hospital_id
age
bmi
height
icu_id
pre_icu_los_days
weight
apache_2_diagnosis
apache_3j_diagnosis
heart_rate_apache
map_apache
resprate_apache
temp_apache
ventilated_apache
d1_diasbp_max
d1_diasbp_min
d1_diasbp_noninvasive_max
d1_diasbp_noninvasive_min
d1_heartrate_max
d1_heartrate_min
d1_mbp_max
d1_mbp_min
d1_mbp_noninvasive_max
d1_mbp_noninvasive_min
d1_resprate_max
d1_resprate_min
d1_spo2_max
d1_spo2_min
d1_sysbp_max
d1_sysbp_min
d1_sysbp_noninvasive_max
d1_sysbp_noninvasive_min
d1_temp_max
d1_temp_min
h1_diasbp_max
h1_diasbp_min
h1_diasbp_noninvasive_max
h1_diasbp_noninvasive_min
h1_heartrate_max
h1_heartrate_min
h1_mbp_max
h1_mbp_min
h1_mbp_noninvasive_max
h1_mbp_noninvasive_min
h1_resprate_max
h1_resprate_min
h1_spo2_max
h1_spo2_min
h1_sysbp_max
h1_sysbp_min
h1_sysbp_noninvasive_max
h1_sysbp_noninvasive_min
d1_glucose_max
d1_glucose_min
d1_potassium_max
d1_potassium_min
apache_4a_hospital_death_prob
apache_4a_icu_death_prob
### Outlier treatment based on the inter-quartile range (IQR)
def outliers(df, ft, factor=1.5):
    """Return the index labels of the rows of ``df`` that are outliers in ``ft``.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1 and Q3 are the 25% and 75% quantiles of
    the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    ft : str
        Name of the column to inspect.
    factor : float, optional
        Multiple of the IQR that sets the fence distance. Defaults to 1.5,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.Index
        Index labels of the flagged rows. Missing values are never flagged,
        since comparisons against NaN evaluate to False.
    """
    values = df[ft]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return df.index[(values < lower_bound) | (values > upper_bound)]
### Collect the index labels of every row flagged as an outlier in any of the
### selected features; a label appears once per feature that flags it.
outlier_features = (
    'apache_3j_diagnosis',
    'heart_rate_apache',
    'h1_resprate_min',
    'h1_spo2_max',
    'h1_spo2_min',
)
index_list = []
for feature in outlier_features:
    index_list += list(outliers(df_union, feature))
def remove(df, ls):
    """Return a copy of `df` without the rows whose index labels are in `ls`.

    Repeated labels in `ls` are collapsed before dropping; `df` itself is
    left unmodified.
    """
    etiquetas_unicas = sorted(set(ls))
    return df.drop(index=etiquetas_unicas)
# Drop every row flagged as an outlier and display the (rows, columns) shape
# of the resulting frame.
df_union_cleaned = remove(df_union, index_list)
df_union_cleaned.shape
(50108, 84)
### Check visually that the outliers of the treated features were handled correctly
list_correlacion3 = ['apache_3j_diagnosis', 'heart_rate_apache', 'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min']
for columna in list_correlacion3:
    serie_limpia = df_union_cleaned[columna]
    sns.boxplot(x=serie_limpia)
    print(columna)
    plt.show()
apache_3j_diagnosis
heart_rate_apache
h1_resprate_min
h1_spo2_max
h1_spo2_min
# Persist the outlier-free frame to disk; the index is not written.
# NOTE(review): if patient_id is the index of df_union_cleaned at this point,
# index=False leaves the patient identifier out of the CSV — confirm.
df_union_cleaned.to_csv('traintest_cleaned.csv', index=False)
# Index the target frame by patient_id (in place), then display its shape.
df_target.set_index('patient_id', inplace=True)
df_target.shape
(64199, 1)
# List the distinct values taken by the target column.
df_target.hospital_death.unique()
array([0, 1])
# Preview the first rows of the target frame.
df_target.head()
| hospital_death | |
|---|---|
| patient_id | |
| 69039 | 0 |
| 127397 | 1 |
| 77670 | 0 |
| 19566 | 0 |
| 104990 | 1 |
# Attach the target to the cleaned training rows; the left join keeps every
# training row.
df_t = pd.merge(df_union_cleaned[df_union_cleaned['tipo_dato']=='train'], df_target, on='patient_id', how='left')
# 'tipo_dato' still holds strings at this point. pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead, so the
# frame is restricted to its numeric columns explicitly.
corr = df_t.select_dtypes(include='number').corr().abs()
# Rank the features by absolute correlation with the target.
corr[['hospital_death']].sort_values(by = 'hospital_death',ascending = False).style.background_gradient()
| hospital_death | |
|---|---|
| hospital_death | 1.000000 |
| apache_4a_hospital_death_prob | 0.312825 |
| apache_4a_icu_death_prob | 0.290189 |
| gcs_motor_apache | 0.236107 |
| gcs_eyes_apache | 0.217731 |
| gcs_verbal_apache | 0.206479 |
| ventilated_apache | 0.203114 |
| d1_temp_min | 0.185644 |
| d1_sysbp_noninvasive_min | 0.171179 |
| d1_sysbp_min | 0.171068 |
| intubated_apache | 0.169603 |
| d1_mbp_min | 0.168117 |
| d1_mbp_noninvasive_min | 0.167658 |
| d1_spo2_min | 0.159194 |
| d1_diasbp_noninvasive_min | 0.155508 |
| d1_diasbp_min | 0.155469 |
| d1_heartrate_max | 0.142714 |
| temp_apache | 0.134585 |
| h1_mbp_noninvasive_min | 0.115227 |
| h1_mbp_min | 0.115171 |
| h1_sysbp_min | 0.110698 |
| heart_rate_apache | 0.110216 |
| h1_sysbp_noninvasive_min | 0.109611 |
| h1_diasbp_min | 0.104390 |
| h1_diasbp_noninvasive_min | 0.103991 |
| age | 0.103624 |
| h1_heartrate_max | 0.092010 |
| d1_resprate_max | 0.091717 |
| d1_potassium_max | 0.091153 |
| h1_resprate_max | 0.086964 |
| h1_resprate_min | 0.086204 |
| apache_3j_diagnosis | 0.074341 |
| apache_2_diagnosis | 0.072929 |
| h1_heartrate_min | 0.071613 |
| pre_icu_los_days | 0.068634 |
| resprate_apache | 0.067902 |
| elective_surgery | 0.067685 |
| apache_post_operative | 0.061982 |
| d1_glucose_max | 0.060597 |
| solid_tumor_with_metastasis | 0.058703 |
| _Apache_3j_bodysystem | 0.055115 |
| gcs_unable_apache | 0.049730 |
| h1_mbp_max | 0.049476 |
| h1_mbp_noninvasive_max | 0.049153 |
| immunosuppression | 0.048308 |
| d1_spo2_max | 0.046389 |
| hepatic_failure | 0.043789 |
| h1_sysbp_max | 0.042678 |
| h1_sysbp_noninvasive_max | 0.042154 |
| cirrhosis | 0.040361 |
| h1_spo2_min | 0.034760 |
| _Apache_2_bodysystem | 0.033753 |
| arf_apache | 0.030949 |
| leukemia | 0.029804 |
| d1_temp_max | 0.029103 |
| weight | 0.028952 |
| d1_heartrate_min | 0.027363 |
| map_apache | 0.026025 |
| d1_resprate_min | 0.025592 |
| h1_diasbp_noninvasive_max | 0.024962 |
| h1_diasbp_max | 0.024915 |
| h1_spo2_max | 0.024875 |
| bmi | 0.024673 |
| d1_glucose_min | 0.020542 |
| _Icu_type | 0.019278 |
| Unnamed: 0 | 0.018421 |
| d1_diasbp_max | 0.017046 |
| d1_diasbp_noninvasive_max | 0.016948 |
| lymphoma | 0.016568 |
| d1_potassium_min | 0.016451 |
| icu_id | 0.015149 |
| diabetes_mellitus | 0.011937 |
| height | 0.010377 |
| Icu_stay_type | 0.009930 |
| aids | 0.006794 |
| _Ethnicity | 0.005037 |
| d1_mbp_noninvasive_max | 0.002932 |
| _Gender | 0.002441 |
| d1_mbp_max | 0.001706 |
| d1_sysbp_noninvasive_max | 0.001129 |
| d1_sysbp_max | 0.000754 |
| hospital_id | 0.000522 |
| encounter_id | 0.000435 |
| _Icu_admit_source | 0.000187 |
# The train/test marker is no longer needed once only training rows remain.
df_t.drop(columns='tipo_dato', inplace=True)
def proporciones_final (var,target,df):
    """Plot how the mean of `target` (as a percentage) varies across `var`.

    The rows of `df` are grouped by `var`; for each group the mean of
    `target` times 100 ('%fugas') and the number of non-null targets
    ('Conteo') are computed. Only groups with a rate above 0 and more than
    10 observations are plotted.

    * Between 2 and 99 groups: bar chart of the counts with the rate drawn
      as a green line on a secondary axis limited to 0-100.
    * Otherwise: seaborn scatter plot of the rate against `var` with a
      fitted regression line.

    The figure is rendered with ``plt.show()`` and then closed, so calling
    this function in a loop no longer piles up open figures (the cause of
    matplotlib's "More than 20 figures have been opened" warning).

    Args:
        var: Name of the column to group by.
        target: Name of the column whose mean is plotted.
        df: DataFrame containing both columns.

    Returns:
        None.
    """
    grupos = df[target].groupby(df[var])
    proporcion = pd.DataFrame()
    proporcion['%fugas'] = grupos.mean()*100
    proporcion['Conteo'] = grupos.count()
    proporcion = proporcion.round(3)
    proporcion_filtered = proporcion[(proporcion['%fugas']>0) & (proporcion['Conteo']>10)]
    if 1 < len(proporcion_filtered) < 100:
        fig = plt.figure()
        ax = proporcion_filtered['Conteo'].plot(kind='bar',grid=True)
        ax2 = ax.twinx()
        ax2.plot(proporcion_filtered['%fugas'].values, linestyle='-', linewidth=2.0,color='g')
        # The rate is a percentage, so pin the secondary axis to 0-100.
        plt.ylim(0, 100)
        plt.tight_layout()
    else:
        # Rebind instead of mutating in place: the filtered frame is a slice
        # of `proporcion`, and in-place edits on it can trigger pandas'
        # SettingWithCopyWarning.
        proporcion_filtered = proporcion_filtered.reset_index()
        grid = sns.lmplot(x = var,y ='%fugas', data=proporcion_filtered,fit_reg=True,ci=None)
        fig = grid.fig
    # Render now and release the figure so repeated calls do not leak memory.
    plt.show()
    plt.close(fig)
# Plot the hospital_death rate profile for every column of the training frame.
for columna in df_t.columns:
    proporciones_final(columna, 'hospital_death', df_t)
/Users/alejandromartinruiz/opt/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py:409: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
# Hold out 20% of the training rows for evaluation, keeping the class
# proportions of hospital_death in both parts (fixed seed for repeatability).
predictoras = df_t.drop(columns='hospital_death')
objetivo = df_t['hospital_death']
X_train, X_test, y_train, y_test = train_test_split(
    predictoras,
    objetivo,
    test_size=0.2,
    random_state=0,
    stratify=objetivo,
)
def saca_metricas(y_real, y_pred):
    """Print the ROC AUC of `y_pred` against `y_real` and draw the ROC curve.

    Args:
        y_real: True binary labels.
        y_pred: Values forwarded to ``roc_curve`` as the score argument
            (class probabilities, decision scores or predicted labels).

    Returns:
        None. The AUC is printed and the curve is drawn on the current
        matplotlib axes together with the diagonal of a random classifier.
    """
    # The thresholds returned by roc_curve are not needed here.
    false_positive_rate, recall, _ = roc_curve(y_real, y_pred)
    roc_auc = auc(false_positive_rate, recall)
    # f-string prefix added: the message used to print the literal
    # text "{roc_auc}" instead of the value.
    print(f'- AUC: {roc_auc}')
    plt.plot(false_positive_rate, recall, 'b')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.title('AUC = %0.2f' % roc_auc)
from sklearn.tree import DecisionTreeClassifier
# Model and predictions:
tree_mod = DecisionTreeClassifier(criterion="gini").fit(X_train, y_train)
tree_pred = tree_mod.predict(X_test)
# Metrics: a ROC curve needs a continuous score, so pass the probability of
# the positive class (column 1) instead of the hard 0/1 predictions, which
# collapse the curve to a single operating point.
saca_metricas(y_test, tree_mod.predict_proba(X_test)[:, 1])
- AUC: {roc_auc}
# Display the names of the feature columns used for training.
X_train.columns
Index(['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi', 'elective_surgery', 'height', 'icu_id', 'pre_icu_los_days', 'weight', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'heart_rate_apache', 'intubated_apache', 'map_apache', 'resprate_apache', 'temp_apache', 'ventilated_apache', 'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max', 'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max', 'd1_temp_min', 'h1_diasbp_max', 'h1_diasbp_min', 'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min', 'h1_heartrate_max', 'h1_heartrate_min', 'h1_mbp_max', 'h1_mbp_min', 'h1_mbp_noninvasive_max',
'h1_mbp_noninvasive_min', 'h1_resprate_max', 'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min', 'h1_sysbp_max', 'h1_sysbp_min', 'h1_sysbp_noninvasive_max', 'h1_sysbp_noninvasive_min', 'd1_glucose_max', 'd1_glucose_min', 'd1_potassium_max', 'd1_potassium_min', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob', 'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', '_Ethnicity', '_Gender', '_Icu_admit_source', '_Icu_type', '_Apache_3j_bodysystem', '_Apache_2_bodysystem', 'Icu_stay_type'],
dtype='object')
# Random forest with default hyper-parameters.
classifier2 = RandomForestClassifier().fit(X_train, y_train)
pred2 = classifier2.predict(X_test)
# Evaluate the ROC/AUC on the positive-class probability: hard 0/1 labels
# reduce the curve to a single point and understate the ranking quality.
saca_metricas(y_test, classifier2.predict_proba(X_test)[:, 1])
- AUC: {roc_auc}
# Imported here so this cell stays self-contained.
from sklearn.pipeline import make_pipeline
# liblinear did not converge on the raw features (ConvergenceWarning), so the
# features are standardised before the L1-penalised linear SVM and the
# iteration budget is raised. The pipeline exposes the same
# predict / decision_function methods as the bare estimator.
classifier3 = make_pipeline(
    StandardScaler(),
    LinearSVC(penalty='l1', dual=False, max_iter=5000),
).fit(X_train, y_train)
pred3 = classifier3.predict(X_test)
/Users/alejandromartinruiz/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py:985: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
# LinearSVC has no predict_proba; its signed distance to the separating
# hyperplane is the continuous score a ROC curve needs. Hard labels would give
# a single-point curve.
saca_metricas(y_test, classifier3.decision_function(X_test))
- AUC: {roc_auc}
# Imported here so this cell stays self-contained.
from sklearn.pipeline import make_pipeline
# lbfgs reached its iteration limit on the raw features (ConvergenceWarning).
# Standardising the inputs, as that warning recommends, and raising max_iter
# lets the solver converge. The pipeline exposes predict / predict_proba like
# the bare estimator.
classifier4 = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
).fit(X_train, y_train)
pred4 = classifier4.predict(X_test)
/Users/alejandromartinruiz/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# Score the ROC/AUC with the positive-class probability rather than the hard
# 0/1 predictions, which would give a single-point curve.
saca_metricas(y_test, classifier4.predict_proba(X_test)[:, 1])
- AUC: {roc_auc}
# Cross-validation: 5-fold ROC AUC of the random forest on the training split.
cv = cross_val_score(classifier2, X_train, y_train, scoring="roc_auc", cv=5)
print(cv)
# Mean and (population) standard deviation of the fold scores.
print("CV ROC:", cv.mean(), cv.std())
[0.87454628 0.85534947 0.86192386 0.85160267 0.8598617 ] CV ROC: 0.8606567946404191 0.007814094500334757
# Map each feature name to a one-element list holding its importance in the
# fitted forest, then show the ten most important features.
imp = {
    columna: [importancia]
    for columna, importancia in zip(X_train.columns, classifier2.feature_importances_)
}
tabla_importancias = pd.DataFrame.from_dict(imp, orient="index", columns=["Importance"])
tabla_importancias.sort_values("Importance", ascending=False).head(10).style.background_gradient()
| Importance | |
|---|---|
| apache_4a_hospital_death_prob | 0.167694 |
| apache_4a_icu_death_prob | 0.043142 |
| d1_spo2_min | 0.029549 |
| d1_heartrate_min | 0.026500 |
| d1_temp_max | 0.023221 |
| d1_resprate_min | 0.021668 |
| bmi | 0.020878 |
| d1_glucose_max | 0.020324 |
| d1_temp_min | 0.020048 |
| d1_potassium_min | 0.019668 |
# Hyper-parameter grid explored for the random forest.
grid_param = {
    'n_estimators': [100,300,500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [16,20,22,24],
    # 'auto' was only an alias of 'sqrt' for classifiers, so listing both
    # fitted every candidate twice; the alias was also removed in
    # scikit-learn 1.3. 'log2' gives the search a genuinely different option.
    'max_features': ['sqrt', 'log2'],
}
from sklearn.model_selection import StratifiedKFold
# Shuffled, seeded 5-fold split that preserves the class proportions.
stratified_kfold = StratifiedKFold(n_splits=5,
                                   shuffle=True,
                                   random_state=11)
# Exhaustive search scored by ROC AUC, using all available cores.
model_grid = GridSearchCV(estimator=classifier2,
                          param_grid=grid_param,
                          scoring='roc_auc',
                          cv=stratified_kfold,
                          n_jobs=-1)
model_grid.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=11, shuffle=True),
estimator=RandomForestClassifier(criterion='entropy', max_depth=16,
max_features='sqrt',
n_estimators=50),
n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [16, 20, 22, 24],
'max_features': ['auto', 'sqrt'],
'n_estimators': [100, 300, 500]},
scoring='roc_auc')
# Hyper-parameter combination with the best mean cross-validated score.
print(model_grid.best_params_)
{'criterion': 'entropy', 'max_depth': 24, 'max_features': 'auto', 'n_estimators': 500}
# Mean cross-validated ROC AUC achieved by that best combination.
print(model_grid.best_score_)
0.8758220143981411
# Refit the forest with the tuned hyper-parameters. max_features='sqrt' is
# what the removed alias 'auto' meant for classifiers, so the model is the
# same but also runs on scikit-learn >= 1.3, where 'auto' is rejected.
classifier2 = RandomForestClassifier(criterion = 'entropy', n_estimators= 500, max_depth = 24, max_features= 'sqrt').fit(X_train, y_train)
pred2 = classifier2.predict(X_test)
print(classification_report(y_test,pred2))
precision recall f1-score support
0 0.94 0.99 0.97 6493
1 0.73 0.19 0.30 530
accuracy 0.93 7023
macro avg 0.83 0.59 0.63 7023
weighted avg 0.92 0.93 0.91 7023
### Balanceo de datos para el ajuste del modelo
def run_model_balanced(X_train, X_test, y_train, y_test):
    """Fit a class-weight-balanced logistic regression on the training split.

    Args:
        X_train: Training features.
        X_test: Unused; kept so existing callers keep working.
        y_train: Training labels.
        y_test: Unused; kept so existing callers keep working.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # class_weight="balanced" re-weights the classes inversely to their
    # frequency. max_iter is raised because newton-cg stopped at the default
    # iteration cap without converging on this data.
    clf = LogisticRegression(C=1.0,penalty='l2',random_state=1,solver="newton-cg",class_weight="balanced",max_iter=1000)
    clf.fit(X_train, y_train)
    return clf
# Train the class-balanced logistic regression and predict the held-out split.
model = run_model_balanced(X_train, X_test, y_train, y_test)
pred_y = model.predict(X_test)
/Users/alejandromartinruiz/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/optimize.py:202: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.
# Per-class precision / recall / F1 of the balanced model on the held-out split.
print(classification_report(y_test, pred_y))
precision recall f1-score support
0 0.97 0.80 0.88 6493
1 0.22 0.72 0.34 530
accuracy 0.79 7023
macro avg 0.60 0.76 0.61 7023
weighted avg 0.92 0.79 0.84 7023
# Build the scoring frame from the rows marked as 'test' in the un-cleaned
# union. The target column brought in by the merge and the train/test marker
# are both dropped, leaving only the model features.
filas_test = df_union[df_union['tipo_dato'] == 'test']
df_test_definitivo = pd.merge(filas_test, df_target, on='patient_id', how='left').drop(
    columns=['tipo_dato', 'hospital_death']
)
df_test_definitivo.head()
| Unnamed: 0 | encounter_id | hospital_id | age | bmi | elective_surgery | height | icu_id | pre_icu_los_days | weight | ... | leukemia | lymphoma | solid_tumor_with_metastasis | _Ethnicity | _Gender | _Icu_admit_source | _Icu_type | _Apache_3j_bodysystem | _Apache_2_bodysystem | Icu_stay_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| patient_id | |||||||||||||||||||||
| 112951 | 68842 | 21017 | 166 | 57.0 | 24.161722 | 1 | 193.0 | 675 | 0.355556 | 90.0 | ... | 0.0 | 0.0 | 0.0 | 2 | 1 | 2 | 7 | 10 | 7 | 0 |
| 123759 | 16041 | 64871 | 116 | 72.0 | 24.332277 | 0 | 188.0 | 286 | 0.143750 | 86.0 | ... | 0.0 | 0.0 | 0.0 | 2 | 1 | 0 | 3 | 6 | 8 | 0 |
| 47970 | 42932 | 124400 | 100 | 52.0 | 27.625362 | 0 | 162.0 | 495 | 0.154167 | 72.5 | ... | 0.0 | 0.0 | 0.0 | 2 | 0 | 0 | 5 | 5 | 3 | 0 |
| 73287 | 71761 | 58770 | 135 | 48.0 | 29.708393 | 0 | 152.4 | 705 | 0.050000 | 69.0 | ... | 0.0 | 0.0 | 0.0 | 5 | 0 | 0 | 5 | 9 | 0 | 0 |
| 113052 | 2018 | 71841 | 81 | 51.0 | 23.067672 | 0 | 172.7 | 90 | 0.022917 | 68.8 | ... | 0.0 | 0.0 | 0.0 | 2 | 1 | 0 | 5 | 5 | 3 | 0 |
5 rows × 83 columns
# Probability of the positive class for every test row, as a one-column frame
# that shares the test frame's index and is sorted from highest to lowest.
probabilidades = classifier2.predict_proba(df_test_definitivo)[:, 1]
modelo_proba = pd.DataFrame(
    {'probabilidad': probabilidades},
    index=df_test_definitivo.index,
).sort_values(by=['probabilidad'], ascending=False)
modelo_proba.head()
| probabilidad | |
|---|---|
| patient_id | |
| 73795 | 0.838 |
| 114972 | 0.836 |
| 110373 | 0.810 |
| 52684 | 0.808 |
| 122182 | 0.808 |
# Show the rows with the lowest predicted probability.
modelo_proba.tail()
| probabilidad | |
|---|---|
| patient_id | |
| 76047 | 0.0 |
| 19892 | 0.0 |
| 128140 | 0.0 |
| 44581 | 0.0 |
| 96875 | 0.0 |
# Feature importances of the current classifier2, one single-element list per
# feature name, displayed as a top-10 ranking.
imp = dict(
    (nombre, [peso])
    for nombre, peso in zip(X_train.columns, classifier2.feature_importances_)
)
importancias = pd.DataFrame.from_dict(imp, orient="index", columns=["Importance"])
importancias.sort_values("Importance", ascending=False).head(10).style.background_gradient()
| Importance | |
|---|---|
| apache_4a_hospital_death_prob | 0.088565 |
| apache_4a_icu_death_prob | 0.072310 |
| d1_spo2_min | 0.020472 |
| d1_sysbp_noninvasive_min | 0.020262 |
| apache_3j_diagnosis | 0.020167 |
| d1_sysbp_min | 0.019580 |
| pre_icu_los_days | 0.018381 |
| age | 0.017998 |
| d1_heartrate_max | 0.017247 |
| d1_heartrate_min | 0.016912 |
# Inspect the feature values of the test row labelled 73795.
df_test_definitivo.loc[73795]
Unnamed: 0 47690.000000 encounter_id 10360.000000 hospital_id 21.000000 age 61.000000 bmi 37.987896 elective_surgery 0.000000 height 172.700000 icu_id 513.000000 pre_icu_los_days 0.051389 weight 113.300000 apache_2_diagnosis 113.000000 apache_3j_diagnosis 501.050000 apache_post_operative 0.000000 arf_apache 0.000000 gcs_eyes_apache 2.000000 gcs_motor_apache 4.000000 gcs_unable_apache 0.000000 gcs_verbal_apache 1.000000 heart_rate_apache 136.000000 intubated_apache 1.000000 map_apache 191.000000 resprate_apache 31.000000 temp_apache 34.300000 ventilated_apache 1.000000 d1_diasbp_max 47.000000 d1_diasbp_min 13.000000 d1_diasbp_noninvasive_max 47.000000 d1_diasbp_noninvasive_min 13.000000 d1_heartrate_max 121.000000 d1_heartrate_min 56.000000 d1_mbp_max 60.000000 d1_mbp_min 22.000000 d1_mbp_noninvasive_max 60.000000 d1_mbp_noninvasive_min 22.000000 d1_resprate_max 30.000000 d1_resprate_min 19.000000 d1_spo2_max 100.000000 d1_spo2_min 41.000000 d1_sysbp_max 90.000000 d1_sysbp_min 41.000000 d1_sysbp_noninvasive_max 90.000000 d1_sysbp_noninvasive_min 41.030000 d1_temp_max 36.700000 d1_temp_min 34.300000 h1_diasbp_max 37.000000 h1_diasbp_min 28.000000 h1_diasbp_noninvasive_max 37.000000 h1_diasbp_noninvasive_min 28.000000 h1_heartrate_max 102.000000 h1_heartrate_min 102.000000 h1_mbp_max 49.000000 h1_mbp_min 37.000000 h1_mbp_noninvasive_max 49.000000 h1_mbp_noninvasive_min 37.000000 h1_resprate_max 27.000000 h1_resprate_min 27.000000 h1_spo2_max 90.000000 h1_spo2_min 90.000000 h1_sysbp_max 75.000000 h1_sysbp_min 63.000000 h1_sysbp_noninvasive_max 75.000000 h1_sysbp_noninvasive_min 63.000000 d1_glucose_max 150.000000 d1_glucose_min 33.000000 d1_potassium_max 6.100000 d1_potassium_min 2.400000 apache_4a_hospital_death_prob 0.750000 apache_4a_icu_death_prob 0.720000 aids 0.000000 cirrhosis 0.000000 diabetes_mellitus 0.000000 hepatic_failure 0.000000 immunosuppression 0.000000 leukemia 0.000000 lymphoma 0.000000 solid_tumor_with_metastasis 0.000000 _Ethnicity 2.000000 _Gender 
1.000000 _Icu_admit_source 0.000000 _Icu_type 4.000000 _Apache_3j_bodysystem 9.000000 _Apache_2_bodysystem 0.000000 Icu_stay_type 0.000000 Name: 73795, dtype: float64
# Inspect the feature values of the test row labelled 96875.
df_test_definitivo.loc[96875]
Unnamed: 0 85624.000000 encounter_id 15655.000000 hospital_id 188.000000 age 26.000000 bmi 25.808960 elective_surgery 0.000000 height 180.300000 icu_id 841.000000 pre_icu_los_days 0.272222 weight 83.900000 apache_2_diagnosis 122.000000 apache_3j_diagnosis 703.020000 apache_post_operative 0.000000 arf_apache 0.000000 gcs_eyes_apache 4.000000 gcs_motor_apache 6.000000 gcs_unable_apache 0.000000 gcs_verbal_apache 5.000000 heart_rate_apache 120.000000 intubated_apache 0.000000 map_apache 119.000000 resprate_apache 6.000000 temp_apache 36.900000 ventilated_apache 0.000000 d1_diasbp_max 111.000000 d1_diasbp_min 79.000000 d1_diasbp_noninvasive_max 111.000000 d1_diasbp_noninvasive_min 79.000000 d1_heartrate_max 120.000000 d1_heartrate_min 79.000000 d1_mbp_max 128.000000 d1_mbp_min 95.000000 d1_mbp_noninvasive_max 128.000000 d1_mbp_noninvasive_min 95.000000 d1_resprate_max 21.000000 d1_resprate_min 9.000000 d1_spo2_max 100.000000 d1_spo2_min 90.000000 d1_sysbp_max 163.000000 d1_sysbp_min 121.000000 d1_sysbp_noninvasive_max 163.000000 d1_sysbp_noninvasive_min 121.000000 d1_temp_max 37.300000 d1_temp_min 36.300000 h1_diasbp_max 94.000000 h1_diasbp_min 90.000000 h1_diasbp_noninvasive_max 94.000000 h1_diasbp_noninvasive_min 90.000000 h1_heartrate_max 106.000000 h1_heartrate_min 102.000000 h1_mbp_max 111.000000 h1_mbp_min 107.000000 h1_mbp_noninvasive_max 111.000000 h1_mbp_noninvasive_min 107.000000 h1_resprate_max 16.000000 h1_resprate_min 15.000000 h1_spo2_max 99.000000 h1_spo2_min 90.000000 h1_sysbp_max 148.000000 h1_sysbp_min 146.000000 h1_sysbp_noninvasive_max 148.000000 h1_sysbp_noninvasive_min 146.000000 d1_glucose_max 87.000000 d1_glucose_min 87.000000 d1_potassium_max 3.900000 d1_potassium_min 3.900000 apache_4a_hospital_death_prob 0.000000 apache_4a_icu_death_prob 0.000000 aids 0.000000 cirrhosis 0.000000 diabetes_mellitus 0.000000 hepatic_failure 0.000000 immunosuppression 0.000000 leukemia 0.000000 lymphoma 0.000000 solid_tumor_with_metastasis 0.000000 _Ethnicity 
2.000000 _Gender 1.000000 _Icu_admit_source 0.000000 _Icu_type 7.000000 _Apache_3j_bodysystem 5.000000 _Apache_2_bodysystem 3.000000 Icu_stay_type 0.000000 Name: 70791, dtype: float64